import numpy as np
import sqlite3
import matplotlib.pylab as plt
import datetime as DT
import seaborn as sns
np.set_printoptions(precision=5)
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#### import the PCA library from scikit learn library
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D # didn't get to use this
%matplotlib inline
# Normalizing (scaling) the data is VERY important - indeed can be important to many machine
# learning algorithms. Take the original features and scale them so that they all have zero
# mean and unit variance
from sklearn import preprocessing
## pandas actually has a command to read_sql or read_sql_query and return a pandas.DataFrame
## coerce_float=True argument to force float data type
## Need to FIRST connect to the database by creating a connection object called conn.
conn = sqlite3.connect('database.sqlite')
# create a cursor object using the conn object method
# The cursor object has methods for accessing the data
# c = conn.cursor() # this is not needed for the pd.read_sql IO tool
# Get the database table list from information in the sqlite_master table
# Follow convention to type SQL commands in all caps
# preview all tables in the database
print ('======')
print ('Tables in the database')
# Set the execute SQL command, Fetch and print all table names and info, and
# return a pandas DataFrame
df_tables = pd.read_sql("""SELECT * FROM sqlite_master WHERE type='table';""", conn)
print('df_tables shape: ', df_tables.shape)
print(df_tables)
print ('======')
print('Player_Attributes table:')
print(df_tables.sql[1]) # get sql that CREATE the Player_Atrribtues table
print ('======')
print('Player table:')
print(df_tables.sql[2]) # get sql that CREATE the Player table
print ('======')
print ('Player table')
df_Player = pd.read_sql("""SELECT * FROM Player """, conn)
print('df_Player.shape:', df_Player.shape)
print(df_Player.columns)
print(df_Player.head())
print ('======')
print ('Player_Attributes table')
df_Player_Attributes = pd.read_sql("""SELECT * FROM Player_Attributes""", conn)
print('df_Player_Attributes.shape:', df_Player_Attributes.shape)
print(df_Player_Attributes.columns)
print(df_Player_Attributes.head())
print ('======')
# acquire data from database using pd.read_sql_query(sql, , ,)
# build SQL to SELECT all columns from both Player and Player_Attributes tables
# for rows reocrds w/ matching player_fifa_api_id
sql="SELECT * FROM Player INNER JOIN Player_Attributes ON Player.player_fifa_api_id=Player_Attributes.player_fifa_api_id;"
df_all_col=pd.read_sql_query(sql, conn, coerce_float=True, params=None, parse_dates=['birthday','date'], chunksize=None)
# calculate age of player at the time attributes were collected
df_all_col['age'] = (df_all_col.date - df_all_col.birthday).astype('timedelta64[Y]')
#Tally total score per player attribute category
df_all_col['total_attack'] = df_all_col.crossing + df_all_col.finishing + df_all_col.heading_accuracy + \
df_all_col.short_passing + df_all_col.volleys
df_all_col['total_skill'] = df_all_col.dribbling + df_all_col.curve + df_all_col.free_kick_accuracy + df_all_col.long_passing + \
df_all_col.ball_control
df_all_col['total_movement'] = df_all_col.acceleration + df_all_col.sprint_speed + df_all_col.agility + \
df_all_col.reactions + df_all_col.balance
df_all_col['total_power'] = df_all_col.shot_power + df_all_col.jumping + df_all_col.stamina + df_all_col.strength + \
df_all_col.long_shots
df_all_col['total_mentality'] = df_all_col.aggression + df_all_col.interceptions + df_all_col.positioning + \
df_all_col.vision + df_all_col.penalties
df_all_col['total_defending'] = df_all_col.marking + df_all_col.standing_tackle + df_all_col.sliding_tackle
df_all_col['total_goalkeeping'] = df_all_col.gk_diving + df_all_col.gk_handling + df_all_col.gk_kicking + \
df_all_col.gk_positioning + df_all_col.gk_reflexes
print('df_all_col.columns: ', df_all_col.columns) # print column labels for all columns from both tables
print('df_all_col.shape:', df_all_col.shape)
print(df_all_col.info())
# identify non_numeric and numeric columns of interest and create two lists of column labels
non_numeric_col=['player_fifa_api_id', 'player_api_id','player_name', 'birthday', 'date', \
'preferred_foot', 'attacking_work_rate', 'defensive_work_rate']
numeric_col = ['age', 'height', 'weight','overall_rating', 'potential','crossing', 'finishing', \
'heading_accuracy','short_passing', 'volleys', 'dribbling', 'curve', \
'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration', \
'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping', \
'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',\
'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle', 'gk_diving', \
'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']
numeric_few_col = ['age', 'height', 'weight','overall_rating', 'potential', 'total_attack', 'total_skill', \
'total_movement', 'total_power', 'total_mentality', 'total_defending', 'total_goalkeeping']
df_all_col.replace(r'\s+', np.nan, regex=True, inplace = True)
df_all_col.dropna(axis=0, how='any', inplace=True) #drop row (sample) with any NA entry
df_all_col.sort_values('player_name',axis=0, inplace=True)
df_all_col.drop_duplicates(inplace=True)
df_all_col.to_csv('df_all_col.csv')
print ('df_all_col.shape: ', df_all_col.shape)
print(df_all_col.shape)
print(df_all_col.info())
print(df_all_col.head())
print(df_all_col.tail())
print(df_all_col['defensive_work_rate'][0:60]) # need more data cleaning for col before ploting
print ('======')
df_unscaled_data = df_all_col[numeric_col]
print('df_unscaled_data.columns:', df_unscaled_data.columns)
print('df_unscaled_data.shape:', df_unscaled_data.shape)
print('df_unscaled_data.info: ', df_unscaled_data.info())
scaled_data = preprocessing.scale(df_unscaled_data) #center and scale the data
print('scaled data:')
print (scaled_data) # preview scaled data
# create a PCA object.
# sklean uses this PCA object that can be trained using one dataset and applied to another dataset
pca = PCA()
print(type(pca))
# do PCA math, calculate loading scores and the variation each PCA accounts for
pca.fit(scaled_data)
# generate coordinates for a PCA graph based on the loading scores and the scaled data
pca_data = pca.transform(scaled_data)
# pca.explained_variance_ratio_ is <class 'numpy.ndarray'>.
# It calculates the percentage of variance that each principal component accoutns for
per_var = np.round(pca.explained_variance_ratio_*100, decimals =1)
print('=======================')
print('percent of explained variance: ')
print(per_var)
PC_labels = ['PC'+ str(x) for x in range(1,len(per_var)+1)] # labels for the Scree Plot: PC1, PC2 ...
print(' ')
print('=======================')
# create Scree Plot
plt.figure(figsize=(18, 6))
plt.bar(x=range(1, len(per_var)+1), height=per_var, tick_label=PC_labels)
plt.ylabel('Percentage of Explained Variance', fontsize='14')
plt.xlabel('Principal Component', fontsize='14')
plt.title('Scree Plot', fontsize='18')
plt.show()
plt.close()
# put pca_data with DataFrame with PC_labels
pca_df = pd.DataFrame(pca_data, index=None, columns=PC_labels)
print(pca_df.head()) # preview transformed and scaled
print('=======================')
print('Principal Components Scatter Matrix')
df_pc_matrix= pca_df[['PC'+ str(x) for x in range(1,21)]] # scatter matrix for PC1, PC2, ..., PC15
pd.plotting.scatter_matrix(df_pc_matrix, alpha=0.1, figsize=(14, 14), diagonal='kde',range_padding =0.1)
plt.tight_layout()
plt.show()
plt.close()
loading_scores_PC1 = pd.Series(pca.components_[0],index=numeric_col)
loading_scores_PC1_sorted = loading_scores_PC1.abs().sort_values(ascending=False)
print('Sorted PC1 Loading Scores (abs)')
print('PC1 sorted components: ', loading_scores_PC1_sorted.index)
print(loading_scores_PC1_sorted)
print('=======')
loading_scores_PC2 = pd.Series(pca.components_[1],index=numeric_col)
loading_scores_PC2_sorted = loading_scores_PC2.abs().sort_values(ascending=False)
print('Sorted PC2 Loading Scores (abs)')
print('PC2 sorted components: ', loading_scores_PC2_sorted.index)
print(loading_scores_PC2_sorted)
print('=======')
loading_scores_PC3 = pd.Series(pca.components_[2],index=numeric_col)
loading_scores_PC3_sorted = loading_scores_PC3.abs().sort_values(ascending=False)
print('Sorted PC3 Loading Scores (abs)')
print('PC3 sorted components: ', loading_scores_PC3_sorted.index)
print(loading_scores_PC3_sorted)
Note: When plotting PC1 versus PC2 AND PC1 versus PC3, two clusters are displayed.
Next, we will visualize further in PC1, PC2 and PC3.
# draw PCA 2D plot: PC1 Vs PC2 and PC1 Vs PC3
def color_plot (i):
plt.figure(figsize=(10, 5))
plt.subplot(1,2,1)
plt.scatter(pca_df.PC1, pca_df.PC2, c=scaled_data[:,i], alpha=0.1)
plt.title("PCA Graph: PC1 Versus PC2 - color by " + numeric_col[i], fontsize='12')
plt.xlabel('PC1 - {0}%'.format(per_var[0]), fontsize='12')
plt.ylabel('PC2 - {0}%'.format(per_var[1]), fontsize='12')
plt.tight_layout()
plt.subplot(1,2,2)
plt.scatter(pca_df.PC1, pca_df.PC3, c=scaled_data[:,i], alpha=0.1)
plt.title("PCA Graph: PC1 Versus PC3 - color by " + numeric_col[i], fontsize='12')
plt.xlabel('PC1 - {0}%'.format(per_var[0]), fontsize='12')
plt.ylabel('PC3 - {0}%'.format(per_var[2]), fontsize='12')
plt.tight_layout()
plt.show()
plt.close()
for j in range(0,38):
color_plot (j)
#'attacking_work_rate', 'defensive_work_rate']
#plt.title('players with gk_diving > 40',loc='center')
def plot (df_all, df_sub, hue_col):
# first plot
vis1=sns.lmplot(x='ball_control', y='marking', hue=hue_col, sharex=False, data=df_all, scatter=True, fit_reg=False, units=None, order=1, legend=True)
plt.title('all players')
plt.xlim(0,100)
plt.ylim(0,100)
plt.show()
plt.close()
# second plot: goalkeepers only
vis2=sns.lmplot(x='ball_control', y='marking', hue=hue_col, sharex=False, data=df_sub, scatter=True, fit_reg=False, units=None, order=1, legend=True)
plt.title('players with gk_diving > 40 (goalkeepers)')
plt.xlim(0,100)
plt.ylim(0,100)
plt.show()
plt.close()
print('gk_diving > 40 (goalkeepers)')
df_goalkeepers=df_all_col.loc[df_all_col['gk_diving']>40]
#print(df_goalkeepers.head())
plot(df_all_col, df_goalkeepers, None)
# color by lefty and righty
plot(df_all_col, df_goalkeepers, 'preferred_foot')
# plot lefty only
print('plot preferred left foot')
df1=df_all_col.loc[df_all_col['preferred_foot']=='left']
df2=df_goalkeepers.loc[df_goalkeepers['preferred_foot']=='left']
plot(df1,df2,None)
# color by 'attaching_work_rate'
df3=df_all_col[df_all_col['attacking_work_rate'].isin (['low','medium','high'])]
df4=df_goalkeepers[df_goalkeepers['attacking_work_rate'].isin (['low','medium','high'])]
plot(df3, df4, 'attacking_work_rate')
#plot jointplot with goal keeper attributes:
#'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes'
def joint_plot (df, title) :
vis=sns.jointplot(x='ball_control',y='gk_diving', data=df, xlim=(0,100), ylim=(0,100), stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='ball_control',y='gk_handling', data=df, xlim=(0,100), ylim=(0,100), stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='ball_control',y='gk_kicking', data=df, xlim=(0,100), ylim=(0,100), stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='ball_control',y='gk_positioning', data=df, xlim=(0,100), ylim=(0,100), stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='ball_control',y='gk_reflexes', data=df, xlim=(0,100), ylim=(0,100), stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='marking',y='gk_diving', data=df, xlim=(0,100), ylim=(0,100), stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='marking',y='gk_handling', data=df, xlim=(0,100), ylim=(0,100), stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='marking',y='gk_kicking', data=df, xlim=(0,100), ylim=(0,100), stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='marking',y='gk_positioning', xlim=(0,100), ylim=(0,100), data=df, stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='marking',y='gk_reflexes', xlim=(0,100), ylim=(0,100), data=df, stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
'''
vis=sns.jointplot(x='dribbling',y='gk_diving', data=df, xlim=(0,100), ylim=(0,100), stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='dribbling',y='gk_handling', data=df, xlim=(0,100), ylim=(0,100), stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='dribbling',y='gk_kicking', data=df, xlim=(0,100), ylim=(0,100), stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='dribbling',y='gk_positioning', xlim=(0,100), ylim=(0,100), data=df, stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
vis=sns.jointplot(x='dribbling',y='gk_reflexes', xlim=(0,100), ylim=(0,100), data=df, stat_func=None)
plt.title(title, loc='left')
plt.show()
plt.close()
'''
# df_goalkeepers=df_all_col.loc[df_all_col['gk_diving']>40]
joint_plot (df_all_col, 'all players')
joint_plot (df_goalkeepers, 'gk_diving > 40 (goalkeepers)')
import seaborn as sns
df1=df_all_col[df_all_col['defensive_work_rate'].isin (['low','medium','high'])]
df2=df_all_col[df_all_col['defensive_work_rate'].isin (['high'])]
df3=df_all_col[df_all_col['defensive_work_rate'].isin (['medium'])]
df4=df_all_col[df_all_col['defensive_work_rate'].isin (['low'])]
def lmplot (df):
vis=sns.lmplot(x='marking', y='overall_rating', hue='defensive_work_rate', sharex=False, data=df, \
scatter=True, fit_reg=False, units=None, order=1, legend=True)
plt.title('Colored By Defensive Work Rate')
plt.show()
plt.close()
vis=sns.lmplot(x='standing_tackle', y='overall_rating', hue='defensive_work_rate', sharex=False, data=df, \
scatter=True, fit_reg=False, units=None, order=1, legend=True)
plt.title('Colored By Defensive Work Rate')
plt.show()
plt.close()
vis=sns.lmplot(x='sliding_tackle', y='overall_rating', hue='defensive_work_rate', sharex=False, data=df, \
scatter=True, fit_reg=False, units=None, order=1, legend=True)
plt.title('Colored By Defensive Work Rate')
plt.show()
plt.close()
vis=sns.lmplot(x='interceptions', y='overall_rating', hue='defensive_work_rate', sharex=False, data=df, \
scatter=True, fit_reg=False, units=None, order=1, legend=True)
plt.title('Colored By Defensive Work Rate')
plt.show()
plt.close()
lmplot(df1) #color by 'defensive_work_rate'].isin (['low','medium','high'])
lmplot(df2) #color by 'defensive_work_rate'].isin (['high'])
lmplot(df3) #color by 'defensive_work_rate'].isin (['medium'])
lmplot(df4) #color by 'defensive_work_rate'].isin (['low'])
df_totals=df_all_col[numeric_few_col]
pd.plotting.scatter_matrix(df_totals, alpha=0.1, figsize=(16, 16), diagonal='kde',range_padding =0.01)
plt.tight_layout()
plt.show()
plt.close()
total_cols = numeric_few_col + ['player_fifa_api_id', 'player_name']
df_t=df_all_col[total_cols]
df_t.to_csv('player_total_attributes.csv')
print(df_t.shape)
print (df_t.head())
vis=sns.lmplot(x='total_goalkeeping', y='overall_rating', hue = None, sharex=False, data=df_totals, \
scatter=True, fit_reg=False, units=None, order=1, legend=True)
plt.title('Total Goalkeeping Versus Overall Rating')
plt.show()
plt.close()
df_totals_gk = df_totals[df_totals['total_goalkeeping'] > 200]
corr_gk=df_totals_gk[['total_goalkeeping','overall_rating']].corr()
print ('A Closer Look at the Goalkeeper Subgroup on the Far Right')
print(' total_goalkeeping > 200 ')
print(' ')
print ('Correlation Coefficient between Overall Rating and Total Goalkeeping Attribute')
print(corr_gk)
Exactly –1. A perfect downhill (negative) linear relationship
• –0.70. A strong downhill (negative) linear relationship
• –0.50. A moderate downhill (negative) relationship
• –0.30. A weak downhill (negative) linear relationship
• 0. No linear relationship
# save correlation coefficient for dataset to csv
df_corr = df_unscaled_data.corr()
df_corr.to_csv('df_corr.csv')
print('Unscaled Data Scatter Matrix 1')
print('PC1 sorted components: ', loading_scores_PC1_sorted.index[0:19])
#col_of_interest = ['ball_control', 'dribbling', 'short_passing', 'crossing', 'curve','long_shots', 'positioning', 'shot_power', 'vision', 'gk_diving','free_kick_accuracy', 'gk_reflexes', 'gk_handling']
col_of_interest = loading_scores_PC1_sorted.index[0:19]
df_col_of_interest= df_unscaled_data[col_of_interest] # scatter matrix for columns of interest
pd.plotting.scatter_matrix(df_col_of_interest, alpha=0.1, figsize=(16, 16), diagonal='kde',range_padding =0.01)
plt.tight_layout()
plt.show()
plt.close()
print('Unscaled Data Scatter Matrix 2')
print('PC1 sorted components: ', loading_scores_PC1_sorted.index[19:38])
#col_of_interest = ['ball_control', 'dribbling', 'short_passing', 'crossing', 'curve','long_shots', 'positioning', 'shot_power', 'vision', 'gk_diving','free_kick_accuracy', 'gk_reflexes', 'gk_handling']
col_of_interest = loading_scores_PC1_sorted.index[19:38]
df_col_of_interest= df_unscaled_data[col_of_interest] # scatter matrix for columns of interest
pd.plotting.scatter_matrix(df_col_of_interest, alpha=0.1, figsize=(16, 16), diagonal='kde',range_padding =0.01)
plt.tight_layout()
plt.show()
plt.close()
# create distribution plot for all features
final_col = ['player_fifa_api_id','preferred_foot','attacking_work_rate', 'defensive_work_rate'] + numeric_col
print(final_col)
df_final=df_all_col[final_col]
'''
df_final=def_all_col['player_fifa_api_id', 'preferred_foot', 'attacking_work_rate', 'defensive_work_rate', 'age', \
'height', 'weight', 'overall_rating', 'potential', 'crossing', 'finishing', 'heading_accuracy', \
'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', \
'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', \
'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning', \
'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle', 'gk_diving', 'gk_handling', \
'gk_kicking', 'gk_positioning', 'gk_reflexes']
'''
print(df_final.head())
print(len(final_col))
df_final.to_csv("df_final.csv")
# distplot for goalkeeping attributes
fig = plt.figure(figsize=(24,18))
ax1 = fig.add_subplot(321)
ax2 = fig.add_subplot(322)
ax3 = fig.add_subplot(323)
ax4 = fig.add_subplot(324)
ax5 = fig.add_subplot(325)
vis1=sns.distplot (df_all_col['gk_diving'], bins=30, ax=ax1)
vis2=sns.distplot (df_all_col['gk_handling'], bins=30, ax=ax2)
vis3=sns.distplot (df_all_col['gk_kicking'], bins=30, ax=ax3)
vis4=sns.distplot (df_all_col['gk_positioning'], bins=30, ax=ax4)
vis5=sns.distplot (df_all_col['gk_reflexes'], bins=30, ax=ax5)
plt.show()
plt.close()
# distplot for defending attributes
fig = plt.figure(figsize=(18,5))
ax6 = fig.add_subplot(131)
ax7 = fig.add_subplot(132)
ax8 = fig.add_subplot(133)
vis6=sns.distplot (df_all_col['marking'], bins=30, ax=ax6)
vis7=sns.distplot (df_all_col['standing_tackle'], bins=30, ax=ax7)
vis8=sns.distplot (df_all_col['sliding_tackle'], bins=30, ax=ax8)
plt.show()
plt.close()
# distplot
for i in range (4,42) :
sns.distplot (df_all_col[final_col[i]], bins=30)
plt.title('Distribution Plot')
plt.show()
plt.close()
print('A Closer Look at Overall Rating Versus Reactions Attributes')
print(' ')
corr1=df_all_col[['reactions','overall_rating']].corr()
print(corr1)
vis1=sns.lmplot( x='reactions', y='overall_rating', hue=None, sharex=False, data=df_all_col, scatter=True, fit_reg=True, units=None, order=1, legend=True)
plt.show()
df11=df_all_col[df_all_col['attacking_work_rate'].isin (['low','medium','high'])]
corr11=df11[['reactions','overall_rating']].corr()
print(corr11)
vis11=sns.lmplot( x='reactions', y='overall_rating', hue='attacking_work_rate', sharex=False, data=df11, scatter=True, fit_reg=True, units=None, order=1, legend=True)
plt.show()
df12=df_all_col[df_all_col['defensive_work_rate'].isin (['low','medium','high'])]
corr12=df12[['reactions','overall_rating']].corr()
print(corr12)
vis12=sns.lmplot( x='reactions', y='overall_rating', hue='defensive_work_rate', sharex=False, data=df12, scatter=True, fit_reg=True, units=None, order=1, legend=True)
plt.show()
df13=df_all_col[df_all_col['gk_diving'] > 40]
corr13=df13[['reactions','overall_rating']].corr()
print('gk_diving > 40 ')
print(corr13)
vis13=sns.lmplot( x='reactions', y='overall_rating', hue=None, sharex=False, data=df13, scatter=True, fit_reg=True, units=None, order=1, legend=True)
plt.show()
plt.close()
df14=df_all_col[df_all_col['gk_diving'] < 41]
corr14=df14[['reactions','overall_rating']].corr()
print('gk_diving < 41')
print(corr14)
vis14=sns.lmplot( x='reactions', y='overall_rating', hue=None, sharex=False, data=df14, scatter=True, fit_reg=True, units=None, order=1, legend=True)
plt.show()
plt.close()
PCA results shows that the first three principal components out of 38 dimensions, PC1, PC2 and PC2, explain 43.9%, 15.7% and 8.9%, respectively, which is a total of 68.5% of the variance in the final dataset of soccer players. Loading scores of all player attributes in the three principal components show that no one single player attribute contributes significantly more than any other attributes to these three principal components. Scatter matrix plots of first 20 principal components indicate presence of two distinct large and small subgroups of players in the PC1 dimension. Scatter plots, distribution plots, joinplots and correlation coefficient analysis explained the clusters and substantiated the answers to the research questions.
Some of the attributes, marking, standing tackle and sliding tackle, display bimodal distribution, which suggests that these attributes divide the players into two subgroups. Ditribution plots basing on goalkeeper attributes like gk_diving, gk_reflexes, gk_handling, gk_positioning and gk_kicking also show a large cluster with distinctively low scores and a much smaller subgroup with relatively high scores.
For correlation of overall rating with a single player attribute, reaction attribute has the strongest positive linear correlation. (coeff = 0.7248)
Player attributes are tallied by the following attribute categories for further analysis:
Analysis shows that overall rating very strong correlation (coeff = 0.978269) with total goalkeeping attribute categpry of the goalkeeper subgroup. This subgroup of players have total scores of all goalkeeper attributes greater than 200. It is reasonanle to beieve that this is a subgroup group of goalkeepers, who receive specialized drills on attributes that are important for goalkeeper position. As for the other two subgroups with total goalkeeping scores less than 200, overall rating and total goalkeeping scores do not correlate at all.
A closer look at the loading values for PC1 shows that, basically, no skill is much more influential than others.
Sorted Loading Scores for all attributes in PC1 dimension:
ball_control 0.233895
dribbling 0.226609
short_passing 0.220168
crossing 0.213529
curve 0.211442
long_shots 0.211257
positioning 0.204851
shot_power 0.199655
vision 0.197614
gk_diving 0.196600
free_kick_accuracy 0.196366
gk_reflexes 0.195234
gk_handling 0.195011
gk_positioning 0.193662
volleys 0.193077
finishing 0.191773
acceleration 0.185480
penalties 0.183370
sprint_speed 0.182038
long_passing 0.177636
agility 0.168734
stamina 0.158548
balance 0.153811
height 0.134852
heading_accuracy 0.132158
weight 0.125473
gk_kicking 0.121982
reactions 0.105162
aggression 0.087644
overall_rating 0.083033
potential 0.080174
interceptions 0.071469
standing_tackle 0.065230
sliding_tackle 0.059237
marking 0.052163
strength 0.028138
age 0.005573
jumping 0.001561